Predicting with Regression

While easy to implement and interpret, regression can have poor performance in non-linear settings

Data obtention, splitting, exploratory data analysis

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# load the Old Faithful geyser data (eruption duration vs waiting time)
data(faithful)
# fix the RNG seed so the data partition is reproducible
set.seed(333)
# 50/50 train/test split stratified on the outcome; use FALSE rather than F
# (F is an ordinary variable that can be reassigned, FALSE cannot)
inTrain<-createDataPartition(y = faithful$waiting, p = 0.5, list = FALSE)
trainFaith<-faithful[inTrain, ]
testFaith<-faithful[-inTrain, ]
head(trainFaith)
##    eruptions waiting
## 6      2.883      55
## 11     1.833      54
## 16     2.167      52
## 19     1.600      52
## 22     1.750      47
## 27     1.967      55
plot(x = trainFaith$waiting, y = trainFaith$eruptions, pch=19, col="blue", xlab="Waiting", ylab="Duration")

plot of chunk unnamed-chunk-1

Building the linear model

# fit linear model: eruption duration as a linear function of waiting time
lm1<-lm(formula = eruptions ~ waiting, data = trainFaith)
summary(lm1)
## 
## Call:
## lm(formula = eruptions ~ waiting, data = trainFaith)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2699 -0.3479  0.0398  0.3659  1.0502 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.79274    0.22787   -7.87    1e-12 ***
## waiting      0.07390    0.00315   23.47   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared:  0.803,  Adjusted R-squared:  0.802 
## F-statistic:  551 on 1 and 135 DF,  p-value: <2e-16
# replot the training data and overlay the fitted regression line
plot(x = trainFaith$waiting, y = trainFaith$eruptions, pch=19, col="blue", xlab="Waiting", ylab="Duration")
lines(x = trainFaith$waiting, lm1$fitted, lwd=3, col="red")

plot of chunk unnamed-chunk-2

Predicting a new value

# predict eruption duration for a single new waiting time of 80 minutes
newdata<-data.frame(waiting=80)
predict(object = lm1, newdata = newdata)
##     1 
## 4.119

Plotting predictions on the training and test sets

# side-by-side panels: model fit on training data (left) and test data (right)
par(mfrow=c(1,2))
plot(x = trainFaith$waiting,
     y = trainFaith$eruptions, 
     xlab = "Waiting", 
     ylab = "Duration", 
     main = "Training Data", 
     pch=19, col="blue")
# fitted values on the training set
lines(x = trainFaith$waiting, 
      y = predict(object = lm1), 
      lwd=3, col="red")
plot(x = testFaith$waiting, 
     y = testFaith$eruptions, 
     xlab = "Waiting", 
     ylab = "Duration", 
     main = "Testing Data", 
     pch=19, col="blue")
# predictions from the training-set model applied to the held-out test set
lines(x = testFaith$waiting, 
      y = predict(object = lm1, newdata = testFaith), 
      lwd=3, col="red")

plot of chunk unnamed-chunk-4

Get the training and test set errors

# get the error on the training set
# NOTE(review): this is the root *sum* of squared errors, not RMSE — there is
# no division by n inside sqrt(); divide by length(trainFaith$eruptions) for
# a true per-observation RMSE
sqrt(
        sum(
                (lm1$fitted-trainFaith$eruptions)^2
                )
        )
## [1] 5.752
# get the root (sum of) squared error on the TEST set (the original comment
# said "training set", but this uses testFaith); note it is slightly larger
# than the training error, as expected for held-out data
sqrt(
        sum(
                (predict(object = lm1, newdata = testFaith)-testFaith$eruptions)^2
                )
        )
## [1] 5.839

Get Prediction Intervals

# prediction intervals (wider than confidence intervals: they account for the
# noise in a single new observation, not just uncertainty in the mean)
pred1<-predict(object = lm1, newdata = testFaith, interval = "prediction")
# order the values for the test data set 
ord<-order(testFaith$waiting)
plot(x = testFaith$waiting, y = testFaith$eruptions, pch=19, col= "blue")
# plot prediction intervals; pred1 columns are fit, lwr, upr
matlines(x = testFaith$waiting[ord], pred1[ord, ], type = "l", lwd=3)

plot of chunk unnamed-chunk-6

Doing it all in caret

# the same linear model fit through caret's train() wrapper; the underlying
# lm object is in modFit$finalModel
modFit<-train(eruptions ~ waiting, data = trainFaith, method = "lm")
summary(modFit$finalModel)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2699 -0.3479  0.0398  0.3659  1.0502 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.79274    0.22787   -7.87    1e-12 ***
## waiting      0.07390    0.00315   23.47   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared:  0.803,  Adjusted R-squared:  0.802 
## F-statistic:  551 on 1 and 135 DF,  p-value: <2e-16

Predicting with regression, multiple covariates

Data obtention, splitting and exda

library(ISLR)
library(ggplot2)
data(Wage)
# drop logwage: it is a transform of the outcome variable we're trying to predict
Wage<-subset(Wage, select = -c(logwage))
summary(Wage)
##       year           age              sex                    maritl    
##  Min.   :2003   Min.   :18.0   1. Male  :3000   1. Never Married: 648  
##  1st Qu.:2004   1st Qu.:33.8   2. Female:   0   2. Married      :2074  
##  Median :2006   Median :42.0                    3. Widowed      :  19  
##  Mean   :2006   Mean   :42.4                    4. Divorced     : 204  
##  3rd Qu.:2008   3rd Qu.:51.0                    5. Separated    :  55  
##  Max.   :2009   Max.   :80.0                                           
##                                                                        
##        race                   education                     region    
##  1. White:2480   1. < HS Grad      :268   2. Middle Atlantic   :3000  
##  2. Black: 293   2. HS Grad        :971   1. New England       :   0  
##  3. Asian: 190   3. Some College   :650   3. East North Central:   0  
##  4. Other:  37   4. College Grad   :685   4. West North Central:   0  
##                  5. Advanced Degree:426   5. South Atlantic    :   0  
##                                           6. East South Central:   0  
##                                           (Other)              :   0  
##            jobclass               health      health_ins        wage      
##  1. Industrial :1544   1. <=Good     : 858   1. Yes:2083   Min.   : 20.1  
##  2. Information:1456   2. >=Very Good:2142   2. No : 917   1st Qu.: 85.4  
##                                                            Median :104.9  
##                                                            Mean   :111.7  
##                                                            3rd Qu.:128.7  
##                                                            Max.   :318.3  
## 
# 70/30 train/test split stratified on the outcome wage
inTrain<-createDataPartition(y = Wage$wage, p = 0.7, list=F)
training<-Wage[inTrain, ]
testing<-Wage[-inTrain, ]
c(dim(training), dim(testing))
## [1] 2102   11  898   11
# view feature plot: pairwise relationships of selected predictors and wage
featurePlot(x = training[,c("age", "education", "jobclass")], y = training$wage, plot = "pairs")

plot of chunk unnamed-chunk-8

# more exploratory plots: wage vs age, coloured by job class
qplot(x = age, y = wage, data = training, color= jobclass)

plot of chunk unnamed-chunk-8

qplot(x = age, y = wage, data = training, color= factor(education))

plot of chunk unnamed-chunk-8

Fitting a multicovariate linear relationship

# linear model with multiple covariates; the factor predictors (jobclass,
# education) are expanded into indicator variables automatically
modFit<-train(wage ~ age + jobclass + education, method = "lm", data = training)
finMod<-modFit$finalModel
finMod
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Coefficients:
##                   (Intercept)                            age  
##                        60.997                          0.519  
##      `jobclass2. Information`          `education2. HS Grad`  
##                         5.245                         11.347  
##    `education3. Some College`     `education4. College Grad`  
##                        23.762                         36.754  
## `education5. Advanced Degree`  
##                        62.420

Diagnostics

# residuals vs fitted values: look for non-random patterns
plot(x = finMod, 1)

plot of chunk unnamed-chunk-10

# colour by values not used in the model, to make sure that these are all on the 0 line
qplot(x = finMod$fitted, finMod$residuals, colour=race, data=training)

plot of chunk unnamed-chunk-10

# Very important!!! Plot residuals by index (row number)
plot(x = finMod$residuals, pch=19)

plot of chunk unnamed-chunk-10 Plotting the fitted residuals versus the index shows the high residuals at the right, and you can also see a trend with respect to the row numbers. Whenever you see a trend with respect to row numbers, it suggests that there is a variable missing from your model!!

 # plot predicted vs truth in the test set; ideally they should be close
pred<-predict(object = modFit, newdata = testing)
qplot(x = wage, y = pred, colour=year, data = testing)

plot of chunk unnamed-chunk-11

Predicting with trees

The basic idea: if you have a bunch of variables that you want to use to predict an outcome, you can take each of those variables and use them to split the outcome into different groups. Then, you can evaluate the homogeneity of each group, and continue to separate into groups until the groups are homogeneous enough, or small enough, to stop. While this approach is easy to interpret, and gives better performance in non-linear settings, without pruning or cross-validation, it can lead to overfitting, the uncertainty is harder to estimate, and the results may be variable.

Basic algorithm:

  1. Start with all the variables in one big group
  2. Find the variable or split that best separates the outcomes
  3. Divide the data into two leaves on that split, or node
  4. Within each leaf, find the variable that best separates the outcome
  5. Repeat until the groups are too small or sufficiently pure

Example: Predicting the species with the other variables in the Iris data set

As always, begin with data obtention, splitting and exda

data(iris)
names(iris)
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width" 
## [5] "Species"
# class balance: 50 observations per species
table(iris$Species)
## 
##     setosa versicolor  virginica 
##         50         50         50
set.seed(333)
# 70/30 stratified split on the outcome Species
inTrain<-createDataPartition(y = iris$Species, p = 0.7, list=F)
training<-iris[inTrain, ]
testing<-iris[-inTrain, ]
c(dim(training), dim(testing))
## [1] 105   5  45   5

Look for clusters. Note that a linear model may not be appropriate here

qplot(x = Petal.Width, y = Sepal.Width, data = training, colour = Species)

plot of chunk unnamed-chunk-13

You do not have to implement the tree prediction algorithm. Simply call the method “rpart”

# fit a classification tree (CART) on all predictors via the rpart method
modFit<-train(Species ~., method = "rpart", data = training)
## Loading required package: rpart
modFit$finalModel
## n= 105 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 105 70 setosa (0.3333 0.3333 0.3333)  
##   2) Petal.Length< 2.45 35  0 setosa (1.0000 0.0000 0.0000) *
##   3) Petal.Length>=2.45 70 35 versicolor (0.0000 0.5000 0.5000)  
##     6) Petal.Length< 4.95 40  5 versicolor (0.0000 0.8750 0.1250) *
##     7) Petal.Length>=4.95 30  0 virginica (0.0000 0.0000 1.0000) *

Interpreting results: The final model tells you what all the nodes are, how they are split, and the probability of being in each class. For example, in the second line, Petal.Length < 2.45: all of the examples with a petal length less than 2.45 belong to the species setosa.

Plotting the tree

# base-graphics dendrogram of the fitted tree, with split and count labels
plot(x = modFit$finalModel, uniform = T, main = "Classification Tree")
text(modFit$finalModel, use.n=T, all =T, cex= 0.8)

plot of chunk unnamed-chunk-15

A prettier plot

# rattle provides a prettier tree plot
# NOTE(review): require() returns FALSE instead of erroring when the package
# is missing; library(rattle) would fail fast here
require(rattle)
## Loading required package: rattle
## Rattle: A free graphical interface for data mining with R.
## Version 3.3.0 Copyright (c) 2006-2014 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
fancyRpartPlot(modFit$finalModel)

plot of chunk unnamed-chunk-16

Remember, ultimately we need to predict values. Use predict()

predict(object = modFit, newdata = testing)
##  [1] setosa     setosa     setosa     setosa     setosa     setosa    
##  [7] setosa     setosa     setosa     setosa     setosa     setosa    
## [13] setosa     setosa     setosa     versicolor versicolor versicolor
## [19] versicolor versicolor versicolor versicolor versicolor versicolor
## [25] virginica  versicolor virginica  versicolor versicolor versicolor
## [31] virginica  virginica  virginica  virginica  virginica  virginica 
## [37] virginica  versicolor virginica  virginica  virginica  virginica 
## [43] virginica  virginica  virginica 
## Levels: setosa versicolor virginica

Bagging (bootstrap aggregating)

The basic idea is that when you fit complicated models, when you average those models together, you get a smoother fit that gives you a better trade-off between the variance in your fit and bias.

  1. Take resamples of the data set, and recalculate predictions.
  2. Average the predictions or majority vote

This leads to a similar bias from fitting any one of those models individually, but a reduced variability, because you have averaged those models together. This is most useful for non-linear functions

example: Predicting temperature as a function of ozone

library(ElemStatLearn)
## 
## Attaching package: 'ElemStatLearn'
## 
## The following object is masked _by_ '.GlobalEnv':
## 
##     spam
# load the ozone data and sort rows by the ozone variable
data(ozone, package = "ElemStatLearn")
ozone<-ozone[order(ozone$ozone),]
head(ozone)
##     ozone radiation temperature wind
## 17      1         8          59  9.7
## 19      4        25          61  9.7
## 14      6        78          57 18.4
## 45      7        48          80 14.3
## 106     7        49          69 10.3
## 7       8        19          61 20.1

The procedure is as follows: Build a bootstrap sample of the data set (ten times), and use this to build a new data set called ozone0, and fit a loess (type of smooth curve) relating temperature to the ozone, with a common span. Then predict for every single loess curve an outcome for a new data set for the exact same values.

# construct a matrix to store predictions: one row per bootstrap resample,
# one column per point of the common prediction grid ozone = 1:155
ll<-matrix(NA, nrow = 10, ncol = 155)
for (i in seq_len(nrow(ll))){
        # sample row indices from the ozone data set with replacement
        ss<-sample(seq_len(nrow(ozone)), replace = TRUE)
        # assign the bootstrap sample to a new data set 
        ozone0<-ozone[ss,]
        # order by ozone variable 
        ozone0<-ozone0[order(ozone0$ozone),]
        # fit a loess smoother of temperature on ozone (common span across resamples)
        loess0<-loess(formula = temperature ~ ozone, data = ozone0, span = 0.2)
        # predict on the common grid ozone = 1:155 so every row of ll is
        # comparable and matches the later lines(1:155, ll[i,]) plots
        # (fix: the original used data.frame(ozone[1:155,]), i.e. raw data-frame
        # rows — only 111 exist, so rows 112:155 were NA, and the x positions
        # did not correspond to the 1:155 grid used when plotting)
        ll[i,]<-predict(object = loess0, newdata = data.frame(ozone = 1:155))
}

Visualization and interpretation

# plot observed temperature against ozone
plot(x = ozone$ozone, y = ozone$temperature, pch =19, cex =0.5)

plot of chunk unnamed-chunk-20

We want to show the fit with one resampled data set. These are given by the grey lines below

plot(x = ozone$ozone, y = ozone$temperature, pch =19, cex =0.5)
# grey lines: the loess fit from each individual bootstrap resample
for (i in (1:10)){lines(1:155, ll[i,], col="grey", lwd=2)}
# red line: the bagged fit, i.e. the pointwise average over the ten resample fits
lines(x = 1:155, apply(ll, 2, mean), col ="red", lwd=2)

plot of chunk unnamed-chunk-21

The red line is the bagged loess curve.

Bagging in caret is accomplished by setting the method to be bagEarth, treebag, or bagFDA, or you can build your own bagging function.

example: bagging from regression trees

Take your predictor variable and put it into one data frame.

# bag() expects the predictors as a data frame
predictors<-data.frame(ozone=ozone$ozone)
head(predictors)
##   ozone
## 1     1
## 2     4
## 3     6
## 4     7
## 5     7
## 6     8

Then, obtain the outcome variable

temperature<-ozone$temperature

Pass these as arguments to the bag() function in the caret package, with the number of subsamples B, and the list of options you want for how to fit the model in bagcontrol

# bag conditional-inference trees: B = 10 bootstrap resamples; bagControl
# supplies the per-resample fit, predict and aggregate functions
treebag<-bag(x = predictors, 
             y = temperature, B = 10, 
             bagControl = bagControl(fit = ctreeBag$fit, 
                                     predict = ctreeBag$pred, 
                                     aggregate = ctreeBag$aggregate))
## Loading required package: grid
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: sandwich
## Loading required package: strucchange
## Loading required package: modeltools
## Loading required package: stats4
# grey: observed data; red: a single tree's fit; blue: the bagged prediction
plot(ozone$ozone, ozone$temperature, col="lightgrey", pch=19)
points(ozone$ozone, predict(treebag$fits[[1]]$fit, predictors), pch=19, col="red")
points(ozone$ozone, predict(treebag, predictors), pch=19, col="blue")

plot of chunk unnamed-chunk-25

The grey dots represent the observed values; the red dots are fits from a single conditional regression tree. Note how these do not capture the upward trend very well. Averaging over ten model fits with these regression trees shows the trend (blue)

Random Forests

An extension of bagging. We take a resample of the data set, and at each split, we also bootstrap the variables. It allows us to grow a large number of trees. It is very accurate, but it is slow, and can be difficult to interpret

example Iris data

set.seed(333)
inTrain<-createDataPartition(y = iris$Species, p = 0.7, list = F)
training<-iris[inTrain, ]
testing<-iris[-inTrain, ]
# random forest; proximity = T stores the proximity matrix, which is needed
# for the classCenter() call later on
modFit<-train(Species~., method = "rf", data = training, proximity = T)
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
modFit
## Random Forest 
## 
## 105 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## 
## Summary of sample sizes: 105, 105, 105, 105, 105, 105, ... 
## 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy  Kappa  Accuracy SD  Kappa SD
##   2     1         0.9    0.04         0.06    
##   3     1         0.9    0.04         0.07    
##   4     1         0.9    0.04         0.07    
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 2.

Obtain the second tree that was produced with getTree()

getTree(modFit$finalModel, k = 2)
##    left daughter right daughter split var split point status prediction
## 1              2              3         3        4.85      1          0
## 2              4              5         3        2.60      1          0
## 3              6              7         3        4.95      1          0
## 4              0              0         0        0.00     -1          1
## 5              8              9         3        4.70      1          0
## 6             10             11         1        6.60      1          0
## 7              0              0         0        0.00     -1          3
## 8              0              0         0        0.00     -1          2
## 9             12             13         4        1.60      1          0
## 10             0              0         0        0.00     -1          3
## 11             0              0         0        0.00     -1          2
## 12             0              0         0        0.00     -1          2
## 13            14             15         1        6.05      1          0
## 14             0              0         0        0.00     -1          2
## 15             0              0         0        0.00     -1          3

Obtain the center predictions of the classes

# class "centers" (prototypes) in Petal.Length/Petal.Width space (columns 3-4),
# computed from the random-forest proximity matrix
irisP<-as.data.frame(
        classCenter(x = training[, c(3, 4)], 
                    label = training$Species,
                    prox = modFit$finalModel$proximity)
        )
irisP$Species<-rownames(irisP)
p<-qplot(x = Petal.Width, y = Petal.Length, data = training, col = Species)
# overlay the class centers as large X markers (shape = 4)
p +geom_point(aes(x = Petal.Width, y = Petal.Length, col=Species), size = 5, shape =4, data = irisP)

plot of chunk unnamed-chunk-28

Predict new values

pred<-predict(object = modFit, newdata = testing)
# check to see if the prediction is correct
predRight<-pred == testing$Species
# confusion table: predicted classes (rows) vs actual classes (columns)
table(pred, testing$Species)
##             
## pred         setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         13         0
##   virginica       0          2        15

We missed two; to see which these were,

qplot(Petal.Width, Petal.Length, colour = predRight, data = testing)

plot of chunk unnamed-chunk-30

Boosting

Take a large number of possibly weak predictors, weight them and add them up to get a stronger predictor. We could take all possible trees, or all possible regression models, etc.

Example: Wage

data(Wage)
# drop logwage: it is a transform of the outcome we are trying to predict
Wage<-subset(Wage, select = -c(logwage))
set.seed(333)
# split on the outcome of THIS data set (fix: the original partitioned on
# iris$Species, which left only 105 of the 3000 Wage rows in training —
# the "105 samples" in the printed model summary below reflects that bug)
inTrain<-createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)
training<-Wage[inTrain,]
testing<-Wage[-inTrain, ]
# use boosting with trees, and suppress output with verbose = FALSE
modFit<-train(wage~., method = "gbm", data=training, verbose = FALSE)
modFit
## Stochastic Gradient Boosting 
## 
## 105 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## 
## Summary of sample sizes: 105, 105, 105, 105, 105, 105, ... 
## 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  RMSE  Rsquared  RMSE SD  Rsquared SD
##   1                   50      36    0.3       5        0.08       
##   1                  100      38    0.3       5        0.09       
##   1                  150      38    0.3       5        0.10       
##   2                   50      37    0.3       5        0.09       
##   2                  100      38    0.3       5        0.10       
##   2                  150      38    0.3       5        0.10       
##   3                   50      37    0.3       5        0.09       
##   3                  100      38    0.3       5        0.10       
##   3                  150      38    0.3       5        0.10       
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## RMSE was used to select the optimal model using  the smallest value.
## The final values used for the model were n.trees = 50, interaction.depth
##  = 1 and shrinkage = 0.1.

Visualizing the results

qplot(x = predict(object = modFit, newdata = testing), y = wage, data = testing)

plot of chunk unnamed-chunk-32